{ "cells": [ { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import pickle\n", "from celluloid import Camera\n", "import matplotlib.pyplot as plt\n", "from tqdm.notebook import tqdm\n", "import torch\n", "from torch import nn\n", "from torch.nn import functional as F\n", "from torch.utils.data import DataLoader, Dataset\n", "import numpy as np\n", "from typing import List\n", "import easydict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 데이터 불러오기" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsensor_00sensor_01sensor_02sensor_03sensor_04sensor_05sensor_06sensor_07sensor_08...sensor_43sensor_44sensor_45sensor_46sensor_47sensor_48sensor_49sensor_50sensor_51machine_status
02018-04-01 00:00:002.46539447.0920153.211846.310760634.375076.4597513.4114616.1313615.56713...41.9270839.64120065.6828750.9259338.194440157.986167.70834243.0556201.3889NORMAL
12018-04-01 00:01:002.46539447.0920153.211846.310760634.375076.4597513.4114616.1313615.56713...41.9270839.64120065.6828750.9259338.194440157.986167.70834243.0556201.3889NORMAL
22018-04-01 00:02:002.44473447.3524353.211846.397570638.888973.5459813.3246516.0373315.61777...41.6666639.35185265.3935251.2152838.194443155.960667.12963241.3194203.7037NORMAL
32018-04-01 00:03:002.46047447.0920153.168446.397568628.125076.9889813.3174216.2471115.69734...40.8854139.06250064.8148151.2152838.194440155.960666.84028240.4514203.1250NORMAL
42018-04-01 00:04:002.44571847.1354153.211846.397568636.458376.5889713.3535916.2109415.69734...41.4062538.77315065.1041651.7939838.773150158.275566.55093242.1875201.3889NORMAL
\n", "

5 rows × 54 columns

\n", "
" ], "text/plain": [ " timestamp sensor_00 sensor_01 sensor_02 sensor_03 sensor_04 \\\n", "0 2018-04-01 00:00:00 2.465394 47.09201 53.2118 46.310760 634.3750 \n", "1 2018-04-01 00:01:00 2.465394 47.09201 53.2118 46.310760 634.3750 \n", "2 2018-04-01 00:02:00 2.444734 47.35243 53.2118 46.397570 638.8889 \n", "3 2018-04-01 00:03:00 2.460474 47.09201 53.1684 46.397568 628.1250 \n", "4 2018-04-01 00:04:00 2.445718 47.13541 53.2118 46.397568 636.4583 \n", "\n", " sensor_05 sensor_06 sensor_07 sensor_08 ... sensor_43 sensor_44 \\\n", "0 76.45975 13.41146 16.13136 15.56713 ... 41.92708 39.641200 \n", "1 76.45975 13.41146 16.13136 15.56713 ... 41.92708 39.641200 \n", "2 73.54598 13.32465 16.03733 15.61777 ... 41.66666 39.351852 \n", "3 76.98898 13.31742 16.24711 15.69734 ... 40.88541 39.062500 \n", "4 76.58897 13.35359 16.21094 15.69734 ... 41.40625 38.773150 \n", "\n", " sensor_45 sensor_46 sensor_47 sensor_48 sensor_49 sensor_50 \\\n", "0 65.68287 50.92593 38.194440 157.9861 67.70834 243.0556 \n", "1 65.68287 50.92593 38.194440 157.9861 67.70834 243.0556 \n", "2 65.39352 51.21528 38.194443 155.9606 67.12963 241.3194 \n", "3 64.81481 51.21528 38.194440 155.9606 66.84028 240.4514 \n", "4 65.10416 51.79398 38.773150 158.2755 66.55093 242.1875 \n", "\n", " sensor_51 machine_status \n", "0 201.3889 NORMAL \n", "1 201.3889 NORMAL \n", "2 203.7037 NORMAL \n", "3 203.1250 NORMAL \n", "4 201.3889 NORMAL \n", "\n", "[5 rows x 54 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## 데이터 불러오기\n", "df = pd.read_csv('sensor.csv', index_col=0)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5ffaee7333f04ffe8e0afde1890ef3d0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=52.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "MovieWriter ffmpeg unavailable; using Pillow instead.\n", "c:\\program files\\python37\\lib\\site-packages\\ipykernel_launcher.py:43: MatplotlibDeprecationWarning: savefig() got unexpected keyword argument \"frameon\" which is no longer supported as of 3.3 and will become an error two minor releases later\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "def plot_sensor(temp_df, save_path='sample.gif'):\n", " fig = plt.figure(figsize=(16, 6))\n", " ## 에니메이션 만들기\n", " camera = Camera(fig)\n", " ax=fig.add_subplot(111)\n", " \n", " ## 불량 구간 탐색 데이터\n", " labels = temp_df['machine_status'].values.tolist()\n", " dates = temp_df.index\n", " \n", " for var_name in tqdm([item for item in df.columns if 'sensor_' in item]):\n", " temp_df[var_name].plot(ax=ax)\n", " ax.legend([var_name], loc='upper right')\n", " \n", " ## 고장구간 표시\n", " temp_start = dates[0]\n", " temp_date = dates[0]\n", " temp_label = labels[0]\n", " \n", " for xc, value in zip(dates, labels):\n", " if temp_label != value:\n", " if temp_label == \"BROKEN\":\n", " ax.axvspan(temp_start, temp_date, alpha=0.2, color='blue')\n", " if temp_label == \"RECOVERING\":\n", " ax.axvspan(temp_start, temp_date, alpha=0.2, color='orange')\n", " temp_start=xc\n", " temp_label=value\n", " temp_date = xc\n", " if temp_label == \"BROKEN\":\n", " ax.axvspan(temp_start, xc, alpha=0.2, color='blue')\n", " if temp_label == \"RECOVERING\":\n", " ax.axvspan(temp_start, xc, alpha=0.2, color='orange')\n", " ## 카메라 찍기\n", " camera.snap()\n", " \n", " animation = camera.animate(500, blit=True)\n", " # .gif 파일로 저장하면 끝!\n", " animation.save(\n", " save_path,\n", " dpi=100,\n", " savefig_kwargs={\n", " 'frameon': False,\n", " 'pad_inches': 'tight'\n", " }\n", " )\n", "plot_sensor(df, 'merge.gif')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 데이터 전처리" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "## 데이터 Type 변경\n", "df['date'] = pd.to_datetime(df['timestamp'])\n", "for var_index in [item for item in df.columns if 'sensor_' in item]:\n", " df[var_index] = pd.to_numeric(df[var_index], errors='coerce')\n", "del df['timestamp']\n", "\n", "## 날짜로 sorting\n", "df = df.set_index('date')\n", "df = df.reset_index()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "## 결측 변수 확인\n", "(df.isnull().sum()/len(df)).plot.bar(figsize=(18, 8), colormap='Paired')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "## 중복된 데이터를 삭제합니다.\n", "df = df.drop_duplicates()\n", "## 센서 15번, 센서 50 은 삭제합\n", "del df['sensor_15']\n", "del df['sensor_50']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "## 이전 시점의 데이터로 보간\n", "df = df.fillna(method='ffill')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['NORMAL', 'BROKEN', 'RECOVERING'], dtype=object)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['machine_status'].unique()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "normal_df = df[df['machine_status']=='NORMAL']\n", "abnormal_df = df[df['machine_status']!='NORMAL']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 데이터 분리 " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "normal_df = df[df['machine_status']=='NORMAL']\n", "abnormal_df = df[df['machine_status']!='NORMAL']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "## 시계열 데이터이고, 입력의 형태가 특정 길이(window size)의 sequence 데이터 이므로 shuffle은 사용하지 않습니다.\n", "## Normal 데이터는 학습데이터, 파라미터 설정데이터, 검증용데이터, 실험용데이터의 비율을 7:1:1:1 로 나누어서 사용합니다.\n", "\n", "interval_n = int(len(normal_df)/10)\n", "normal_df1 = df.iloc[0:interval_n*7]\n", "normal_df2 = df.iloc[interval_n*7:interval_n*8]\n", "normal_df3 = df.iloc[interval_n*8:interval_n*9]\n", "normal_df4 = df.iloc[interval_n*9:]\n", "\n", "## abnormal 데이터는 검증용데이터, 실험용데이터의 비율을 5:5 로 나누어서 사용합니다.\n", "interval_ab = int(len(abnormal_df)/2)\n", "abnormal_df1 = df.iloc[0:interval_ab]\n", "abnormal_df2 = df.iloc[interval_ab:]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\program files\\python37\\lib\\site-packages\\ipykernel_launcher.py:2: FutureWarning: DataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.\n", " \n" ] } ], "source": [ "## 데이터 정규화를 위하여 분산 및 평균 추출\n", "mean_df = normal_df1.mean()\n", "std_df = normal_df1.std()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 데이터 셋 구조" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "## 데이터를 불러올 때 index로 불러오기\n", "def make_data_idx(dates, window_size=1):\n", " input_idx = []\n", " for idx in range(window_size-1, len(dates)):\n", " cur_date = dates[idx].to_pydatetime()\n", " in_date = dates[idx - (window_size-1)].to_pydatetime()\n", " \n", " _in_period = (cur_date - in_date).days * 24 * 60 + (cur_date - in_date).seconds / 60\n", " \n", " ## 각 index가 1분 간격으로 떨어져 있는지를 확인합니다.\n", " if _in_period == (window_size-1):\n", " input_idx.append(list(range(idx - window_size+1, idx+1)))\n", " return input_idx" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "## Dataset을 상속받아 데이터를 구성\n", "class TagDataset(Dataset):\n", " def __init__(self, input_size, df, mean_df=None, std_df = None, window_size=1):\n", " \n", " ## 변수 갯수\n", " self.input_size = input_size\n", " \n", " ## 복원할 sequence 길이\n", " self.window_size = window_size\n", " \n", " ## Summary용 데이터 Deep copy\n", " original_df = df.copy()\n", " \n", " ## 정규화\n", " if mean_df is not None and std_df is not None:\n", " sensor_columns = [item for item in df.columns if 'sensor_' in item]\n", " df[sensor_columns] = (df[sensor_columns]-mean_df)/std_df\n", " \n", " ## 연속한 index를 기준으로 학습에 사용합니다.\n", " dates = list(df['date'])\n", " self.input_ids = make_data_idx(dates, window_size=window_size)\n", " \n", " ## sensor 데이터만 사용하여 reconstruct에 활용\n", " self.selected_column = [item for item in df.columns if 'sensor_' in item][:input_size]\n", " self.var_data = torch.tensor(df[self.selected_column].values, dtype=torch.float)\n", " \n", " ## Summary 용\n", " self.df = original_df.iloc[np.array(self.input_ids)[:, -1]]\n", " \n", " ## Dataset은 반드시 __len__ 함수를 만들어줘야함(데이터 길이)\n", " def __len__(self):\n", " return len(self.input_ids)\n", " \n", " ## Dataset은 반드시 __getitem__ 함수를 만들어줘야함\n", " ## torch 모듈은 __getitem__ 을 호출하여 학습할 데이터를 불러옴.\n", " def __getitem__(self, item):\n", " temp_input_ids = self.input_ids[item]\n", " input_values = self.var_data[temp_input_ids]\n", " return input_values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 모델 만들기" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "## 인코더\n", "class Encoder(nn.Module):\n", "\n", " def __init__(self, input_size=4096, hidden_size=1024, num_layers=2):\n", " super(Encoder, self).__init__()\n", " self.hidden_size = hidden_size\n", " self.num_layers = num_layers\n", " self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True,\n", " dropout=0.1, bidirectional=False)\n", "\n", " def forward(self, x):\n", " outputs, (hidden, cell) = self.lstm(x) # out: tensor of shape (batch_size, seq_length, hidden_size)\n", "\n", " return (hidden, cell)\n", " \n", "## 디코더\n", "class Decoder(nn.Module):\n", "\n", " def __init__(self, input_size=4096, hidden_size=1024, output_size=4096, num_layers=2):\n", " super(Decoder, self).__init__()\n", " self.hidden_size = hidden_size\n", " self.output_size = output_size\n", " self.num_layers = num_layers\n", "\n", " self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True,\n", " dropout=0.1, bidirectional=False)\n", "\n", " self.relu = nn.ReLU()\n", " self.fc = nn.Linear(hidden_size, output_size)\n", " \n", " def forward(self, x, hidden):\n", " output, (hidden, cell) = self.lstm(x, hidden) # out: tensor of shape (batch_size, seq_length, hidden_size)\n", " prediction = self.fc(output)\n", "\n", " return prediction, (hidden, cell)\n", " \n", "## LSTM Auto Encoder\n", "class LSTMAutoEncoder(nn.Module):\n", "\n", " def __init__(self,\n", " input_dim: int,\n", " latent_dim: int,\n", " window_size: int=1,\n", " **kwargs) -> None:\n", " \"\"\"\n", " :param input_dim: 변수 Tag 갯수\n", " :param latent_dim: 최종 압축할 차원 크기\n", " :param window_size: 길이\n", " :param kwargs:\n", " \"\"\"\n", "\n", " super(LSTMAutoEncoder, self).__init__()\n", "\n", " self.latent_dim = latent_dim\n", " self.input_dim = input_dim\n", " self.window_size = window_size\n", "\n", " if \"num_layers\" in kwargs:\n", " num_layers = kwargs.pop(\"num_layers\")\n", " else:\n", " num_layers = 1\n", "\n", " self.encoder = Encoder(\n", " input_size=input_dim,\n", " hidden_size=latent_dim,\n", " num_layers=num_layers,\n", " )\n", " self.reconstruct_decoder = Decoder(\n", " input_size=input_dim,\n", " output_size=input_dim,\n", " hidden_size=latent_dim,\n", " num_layers=num_layers,\n", " )\n", "\n", " def forward(self, src:torch.Tensor, **kwargs):\n", " batch_size, sequence_length, var_length = src.size()\n", "\n", " ## Encoder 넣기\n", " encoder_hidden = self.encoder(src)\n", " \n", " inv_idx = torch.arange(sequence_length - 1, -1, -1).long()\n", " reconstruct_output = []\n", " temp_input = torch.zeros((batch_size, 1, var_length), dtype=torch.float).to(src.device)\n", " hidden = encoder_hidden\n", " for t in range(sequence_length):\n", " temp_input, hidden = self.reconstruct_decoder(temp_input, hidden)\n", " reconstruct_output.append(temp_input)\n", " reconstruct_output = torch.cat(reconstruct_output, dim=1)[:, inv_idx, :]\n", " \n", " return [reconstruct_output, src]\n", "\n", " def loss_function(self,\n", " *args,\n", " **kwargs) -> dict:\n", " recons = args[0]\n", " input = args[1]\n", " \n", " ## MSE loss(Mean squared Error)\n", " loss =F.mse_loss(recons, input)\n", " return loss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 학습 구성" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def run(args, model, train_loader, test_loader):\n", " # optimizer 설정\n", " optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)\n", "\n", " ## 반복 횟수 Setting\n", " epochs = tqdm(range(args.max_iter//len(train_loader)+1))\n", " \n", " ## 학습하기\n", " count = 0\n", " best_loss = 100000000\n", " for epoch in epochs:\n", " model.train()\n", " optimizer.zero_grad()\n", " train_iterator = tqdm(enumerate(train_loader), total=len(train_loader), desc=\"training\")\n", "\n", " for i, batch_data in train_iterator:\n", " \n", " if count > args.max_iter:\n", " return model\n", " count += 1\n", " \n", " batch_data = batch_data.to(args.device)\n", " predict_values = model(batch_data)\n", " loss = model.loss_function(*predict_values)\n", "\n", " # Backward and optimize\n", " loss.backward()\n", " optimizer.step()\n", " optimizer.zero_grad()\n", " \n", " train_iterator.set_postfix({\n", " \"train_loss\": float(loss),\n", " })\n", "\n", " model.eval()\n", " eval_loss = 0\n", " test_iterator = tqdm(enumerate(test_loader), total=len(test_loader), desc=\"testing\")\n", " with torch.no_grad():\n", " for i, batch_data in test_iterator:\n", " \n", " batch_data = batch_data.to(args.device)\n", " predict_values = model(batch_data)\n", " loss = model.loss_function(*predict_values)\n", "\n", " eval_loss += loss.mean().item()\n", "\n", " test_iterator.set_postfix({\n", " \"eval_loss\": float(loss),\n", " })\n", " eval_loss = eval_loss / len(test_loader)\n", " epochs.set_postfix({\n", " \"Evaluation Score\": float(eval_loss),\n", " })\n", " if eval_loss < best_loss:\n", " best_loss = eval_loss\n", " else:\n", " if args.early_stop:\n", " print('early stop condition best_loss[{}] eval_loss[{}]'.format(best_loss, eval_loss))\n", " return model\n", " \n", " return model\n", "\n", "def get_loss_list(args, model, test_loader):\n", " test_iterator = tqdm(enumerate(test_loader), total=len(test_loader), desc=\"testing\")\n", " loss_list = []\n", " \n", " with torch.no_grad():\n", " for i, batch_data in test_iterator:\n", " \n", " batch_data = batch_data.to(args.device)\n", " predict_values = model(batch_data)\n", " \n", " ## MAE(Mean Absolute Error)로 계산\n", " loss = F.l1_loss(predict_values[0], predict_values[1], reduce=False)\n", " #loss = loss.sum(dim=2).sum(dim=1).cpu().numpy()\n", " loss = loss.mean(dim=1).cpu().numpy()\n", " loss_list.append(loss)\n", " loss_list = np.concatenate(loss_list, axis=0)\n", " return loss_list\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 모델 & 학습 파라미터 설정" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "## 설정 폴더\n", "args = easydict.EasyDict({\n", " \"batch_size\": 128, ## 배치 사이즈 설정\n", " \"device\": torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'), ## GPU 사용 여부 설정\n", " \"input_size\": 40, ## 입력 차원 설정\n", " \"latent_size\": 10, ## Hidden 차원 설정\n", " \"output_size\": 40, ## 출력 차원 설정\n", " \"window_size\" : 3, ## sequence Lenght\n", " \"num_layers\": 2, ## LSTM layer 갯수 설정\n", " \"learning_rate\" : 0.001, ## learning rate 설정\n", " \"max_iter\" : 100000, ## 총 반복 횟수 설정\n", " 'early_stop' : True, ## valid loss가 작아지지 않으면 early stop 조건 설정\n", "})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 학습하기" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\program files\\python37\\lib\\site-packages\\pandas\\core\\frame.py:3062: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " self[k1] = value[k2]\n" ] } ], "source": [ "## 데이터셋으로 변환\n", "normal_dataset1 = TagDataset(df=normal_df1, input_size=args.input_size, window_size=args.window_size, mean_df=mean_df, std_df=std_df)\n", "normal_dataset2 = TagDataset(df=normal_df2, input_size=args.input_size, window_size=args.window_size, mean_df=mean_df, std_df=std_df)\n", "normal_dataset3 = TagDataset(df=normal_df3, input_size=args.input_size, window_size=args.window_size, mean_df=mean_df, std_df=std_df)\n", "normal_dataset4 = TagDataset(df=normal_df4, input_size=args.input_size, window_size=args.window_size, mean_df=mean_df, std_df=std_df)\n", "abnormal_dataset1 = TagDataset(df=abnormal_df1, input_size=args.input_size, window_size=args.window_size, mean_df=mean_df, std_df=std_df)\n", "abnormal_dataset2 = TagDataset(df=abnormal_df2, input_size=args.input_size, window_size=args.window_size, mean_df=mean_df, std_df=std_df)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "## Data Loader 형태로 변환\n", "train_loader = torch.utils.data.DataLoader(\n", " dataset=normal_dataset1,\n", " batch_size=args.batch_size,\n", " shuffle=True)\n", "valid_loader = torch.utils.data.DataLoader(\n", " dataset=normal_dataset2,\n", " batch_size=args.batch_size,\n", " shuffle=False)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LSTMAutoEncoder(\n", " (encoder): Encoder(\n", " (lstm): LSTM(40, 10, num_layers=2, batch_first=True, dropout=0.1)\n", " )\n", " (reconstruct_decoder): Decoder(\n", " (lstm): LSTM(40, 10, num_layers=2, batch_first=True, dropout=0.1)\n", " (relu): ReLU()\n", " (fc): Linear(in_features=10, out_features=40, bias=True)\n", " )\n", ")" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## 모델 생성\n", "model = LSTMAutoEncoder(input_dim=args.input_size, latent_dim=args.latent_size, window_size=args.window_size, num_layers=args.num_layers)\n", "model.to(args.device)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "baf92e0b69814b36aad366d6bd838cc0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b98b5dc0e6784b699a5825ce45f257f1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='training', max=1126.0, style=ProgressStyle(description_wi…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0a254695e7e84e659415979aa4ebf72b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b2b55483a2404fa3ada59fc1a2ef0e90", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='training', max=1126.0, style=ProgressStyle(description_wi…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b3f1d07f4d4549e098e5a28ffa474728", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c435e541b439416197af9aa5ee10ec0d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='training', max=1126.0, style=ProgressStyle(description_wi…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9ef8646c175f48f0827a13c755899465", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "54f095ac043e48748a195e34694dc7fb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='training', max=1126.0, style=ProgressStyle(description_wi…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "19c7f6ffcfd44154935c617fe0894831", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "33f66ba4a2a94900a673694998d2ece5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='training', max=1126.0, style=ProgressStyle(description_wi…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6e83e0f899d94abc9830fd1a219959fd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f372b376aae341efb01922cdb84e9068", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='training', max=1126.0, style=ProgressStyle(description_wi…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c4cddf702a03426599fb493fcf84f8cc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "74135c07c3db48c489b9bfc1dfbb985b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='training', max=1126.0, style=ProgressStyle(description_wi…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d0045daf9a2b49efa66f977a5dd0f545", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fdc3f9ad8d6b4868a376c755cc217e6d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='training', max=1126.0, style=ProgressStyle(description_wi…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "00b9e61034484f7b938468c178251954", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "early stop condition best_loss[0.10483759593056596] eval_loss[0.10486975242817624]\n", "\n" ] } ], "source": [ "## 학습하기\n", "model = run(args, model, train_loader, valid_loader)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reconstruction Error 평균 공분산 계산" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1c9cc7cb051744fa9de241fcd75e800a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=161.0, style=ProgressStyle(description_widt…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\program files\\python37\\lib\\site-packages\\torch\\nn\\_reduction.py:44: UserWarning: size_average and reduce args will be deprecated, please use reduction='none' instead.\n", " warnings.warn(warning.format(ret))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "## Loss를 구하기\n", "loss_list = get_loss_list(args, model, valid_loader)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "## Reconstruction Error의 평균과 Covarinace 계산\n", "mean = np.mean(loss_list, axis=0)\n", "std = np.cov(loss_list.T)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "## Anomaly Score\n", "class Anomaly_Calculator:\n", " def __init__(self, mean:np.array, std:np.array):\n", " assert mean.shape[0] == std.shape[0] and mean.shape[0] == std.shape[1], '평균과 분산의 차원이 똑같아야 합니다.'\n", " self.mean = mean\n", " self.std = std\n", " \n", " def __call__(self, recons_error:np.array):\n", " x = (recons_error-self.mean)\n", " return np.matmul(np.matmul(x, self.std), x.T)\n", "\n", "## 비정상 점수 계산기\n", "anomaly_calculator = Anomaly_Calculator(mean, std)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5b135b19ff6a4914a43e1985b7750e91", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=20581.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "평균[0.10723994875158432], 중간[0.05052389362104756], 최소[0.004605710808467282], 최대[6.393770241747733]\n" ] } ], "source": [ "## Threshold 찾기\n", "anomaly_scores = []\n", "for temp_loss in tqdm(loss_list):\n", " temp_score = anomaly_calculator(temp_loss)\n", " anomaly_scores.append(temp_score)\n", "\n", "## 정상구간에서 비정상 점수 분포\n", "print(\"평균[{}], 중간[{}], 최소[{}], 최대[{}]\".format(np.mean(anomaly_scores), np.median(anomaly_scores), np.min(anomaly_scores), np.max(anomaly_scores)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 전체 데이터 시각화" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "anomaly_calculator = Anomaly_Calculator(mean, std)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6ab3292c32e64553b50a3deb39c5f118", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='testing', max=1722.0, style=ProgressStyle(description_wid…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "146f3d1449024d01a7d15bb07913ef11", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=220318.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "## 전체 데이터 불러오기\n", "total_dataset = TagDataset(df=df, input_size=args.input_size, window_size=args.window_size, mean_df=mean_df, std_df=std_df)\n", "total_dataloader = torch.utils.data.DataLoader(dataset=total_dataset,batch_size=args.batch_size,shuffle=False)\n", "\n", "## Reconstruction Loss를 계산하기\n", "total_loss = get_loss_list(args, model, total_dataloader)\n", "\n", "## 이상치 점수 계산하기\n", "anomaly_scores = []\n", "for temp_loss in tqdm(total_loss):\n", " temp_score = anomaly_calculator(temp_loss)\n", " anomaly_scores.append(temp_score)\n", "\n", "visualization_df = total_dataset.df\n", "visualization_df['score'] = anomaly_scores\n", "visualization_df['recons_error'] = total_loss.sum(axis=1)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "## 시각화 하기\n", "fig = plt.figure(figsize=(16, 6))\n", "ax=fig.add_subplot(111)\n", "\n", "## 불량 구간 탐색 데이터\n", "labels = visualization_df['machine_status'].values.tolist()\n", "dates = visualization_df.index\n", "\n", "\n", "visualization_df['score'].plot(ax=ax)\n", "ax.legend(['abnormal score'], loc='upper right')\n", "\n", "## 고장구간 표시\n", "temp_start = dates[0]\n", "temp_date = dates[0]\n", "temp_label = labels[0]\n", "\n", "for xc, value in zip(dates, labels):\n", " if temp_label != value:\n", " if temp_label == \"BROKEN\":\n", " ax.axvspan(temp_start, temp_date, alpha=0.2, color='blue')\n", " if temp_label == \"RECOVERING\":\n", " ax.axvspan(temp_start, temp_date, alpha=0.2, color='orange')\n", " temp_start=xc\n", " temp_label=value\n", " temp_date = xc\n", "if temp_label == \"BROKEN\":\n", " ax.axvspan(temp_start, xc, alpha=0.2, color='blue')\n", "if temp_label == \"RECOVERING\":\n", " ax.axvspan(temp_start, xc, alpha=0.2, color='orange')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "## 시각화 하기\n", "fig = plt.figure(figsize=(16, 6))\n", "ax=fig.add_subplot(111)\n", "\n", "## 불량 구간 탐색 데이터\n", "labels = visualization_df['machine_status'].values.tolist()\n", "dates = visualization_df.index\n", "\n", "\n", "visualization_df['recons_error'].plot(ax=ax)\n", "ax.legend(['reconstruction error'], loc='upper right')\n", "\n", "## 고장구간 표시\n", "temp_start = dates[0]\n", "temp_date = dates[0]\n", "temp_label = labels[0]\n", "\n", "for xc, value in zip(dates, labels):\n", " if temp_label != value:\n", " if temp_label == \"BROKEN\":\n", " ax.axvspan(temp_start, temp_date, alpha=0.2, color='blue')\n", " if temp_label == \"RECOVERING\":\n", " ax.axvspan(temp_start, temp_date, alpha=0.2, color='orange')\n", " temp_start=xc\n", " temp_label=value\n", " temp_date = xc\n", "if temp_label == \"BROKEN\":\n", " ax.axvspan(temp_start, xc, alpha=0.2, color='blue')\n", "if temp_label == \"RECOVERING\":\n", " ax.axvspan(temp_start, xc, alpha=0.2, color='orange')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }